{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def reduce_mem_usage_colwise(col):\n",
    "    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n",
    "    col_type = col.dtypes\n",
    "    if col_type in numerics:\n",
    "        c_min = col.min()\n",
    "        c_max = col.max()\n",
    "        if str(col_type)[:3] == 'int':\n",
    "            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
    "                col = col.astype(np.int8)\n",
    "            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
    "                col = col.astype(np.int16)\n",
    "            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
    "                col = col.astype(np.int32)\n",
    "            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
    "                col = col.astype(np.int64)  \n",
    "        else:\n",
    "            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
    "                col = col.astype(np.float16)\n",
    "            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
    "                col = col.astype(np.float32)\n",
    "            else:\n",
    "                col = col.astype(np.float64)    \n",
    "#     gc.collect()\n",
    "    return col"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, train_test_split\n",
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "import lightgbm as lgb\n",
    "\n",
    "from multiprocessing import Pool\n",
    "from functools import partial\n",
    "import gc\n",
    "from tqdm import tqdm\n",
    "import warnings\n",
    "warnings.simplefilter(action='ignore', category=FutureWarning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 34.1 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "DATA_PATH = 'Data/'\n",
    "OUTPUT_PATH = 'Output/'\n",
    "\n",
    "train = pd.read_csv(DATA_PATH + 'train_data.csv')\n",
    "train_labels = pd.read_csv(DATA_PATH + 'train_labels.csv')\n",
    "test = pd.read_csv(DATA_PATH + 'test_data.csv')\n",
    "sample_sub = pd.read_csv(DATA_PATH + 'Sample Submission.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train = pd.merge(train, train_labels, on = 'patient_id', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "cat_cols = ['event_name',  'plan_type', 'specialty']\n",
    "num_cols = ['event_time', 'patient_payment']\n",
    "\n",
    "train[cat_cols] = train[cat_cols].apply(lambda x: x.astype('category'))\n",
    "test[cat_cols] = test[cat_cols].apply(lambda x: x.astype('category'))\n",
    "\n",
    "ID_COL = 'patient_id'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train = train.drop_duplicates()\n",
    "test = test.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 9.48 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "df = pd.concat([train, test])\n",
    "df[cat_cols] = df[cat_cols].apply(lambda x: pd.factorize(x)[0])\n",
    "df = df.sort_values(by=['patient_id', 'event_time'])\n",
    "df = df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "cat_cols = ['event_name',  'plan_type', 'specialty']\n",
    "num_cols = ['event_time', 'patient_payment']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "del train, test\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "event_name_patient_id_grpd = df.groupby(['event_name', 'patient_id']).agg({'event_time': ['min'], 'patient_payment': ['sum', 'max'], 'plan_type': ['nunique'], 'specialty': ['nunique']})\n",
    "event_name_patient_id_grpd.columns = ['grpd_by_event_name_patient_id_' + '_'.join(c).strip('_') for c in event_name_patient_id_grpd.columns]\n",
    "df = pd.merge(df, event_name_patient_id_grpd, on = ['event_name', 'patient_id'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_cols = num_cols + event_name_patient_id_grpd.columns.tolist()\n",
    "del event_name_patient_id_grpd\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "specialty_patient_id_grpd = df.groupby(['specialty', 'patient_id']).agg({'event_time': ['min','max'], 'patient_payment': ['sum', 'max'], 'plan_type': ['nunique'], 'event_name': ['nunique']})\n",
    "specialty_patient_id_grpd.columns = ['grpd_by_specialty_patient_id_' + '_'.join(c).strip('_') for c in specialty_patient_id_grpd.columns]\n",
    "df = pd.merge(df, specialty_patient_id_grpd, on = ['specialty', 'patient_id'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_cols = num_cols + specialty_patient_id_grpd.columns.tolist()\n",
    "del specialty_patient_id_grpd\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "plan_type_patient_id_grpd = df.groupby(['plan_type', 'patient_id']).agg({'event_time': ['min','max'], 'patient_payment': ['sum', 'max'], 'specialty': ['nunique'], 'event_name': ['nunique']})\n",
    "plan_type_patient_id_grpd.columns = ['grpd_by_plan_type_patient_id_' + '_'.join(c).strip('_') for c in plan_type_patient_id_grpd.columns]\n",
    "df = pd.merge(df, plan_type_patient_id_grpd, on = ['plan_type', 'patient_id'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_cols = num_cols + plan_type_patient_id_grpd.columns.tolist()\n",
    "del plan_type_patient_id_grpd\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "event_time_patient_id_grpd = df.groupby(['event_time', 'patient_id']).agg({'plan_type': ['nunique'], 'patient_payment': ['sum', 'max'], 'specialty': ['nunique'], 'event_name': ['nunique']})\n",
    "event_time_patient_id_grpd.columns = ['grpd_by_event_time_patient_id_' + '_'.join(c).strip('_') for c in event_time_patient_id_grpd.columns]\n",
    "df = pd.merge(df, event_time_patient_id_grpd, on = ['event_time', 'patient_id'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_cols = num_cols + event_time_patient_id_grpd.columns.tolist()\n",
    "del event_time_patient_id_grpd\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "features = [c for c in df.columns if c not in ['patient_id', 'plan_type']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "event_name_grpd = df.groupby('event_name').agg({'event_time': ['min'], 'patient_id': ['nunique'], 'patient_payment': ['sum', 'max'], 'plan_type': ['nunique'], 'specialty': ['nunique']})\n",
    "event_name_grpd.columns = ['grpd_by_event_name_' + '_'.join(c).strip('_') for c in event_name_grpd.columns]\n",
    "df = pd.merge(df, event_name_grpd, on = 'event_name', how='left')\n",
    "\n",
    "num_cols = num_cols + event_name_grpd.columns.tolist()\n",
    "del event_name_grpd\n",
    "features = [c for c in df.columns if c not in ['patient_id']]\n",
    "df[features] = df[features].apply(lambda x: reduce_mem_usage_colwise(x))\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "specialty_grpd = df.groupby('specialty').agg({'event_time': ['min'], 'patient_id': ['nunique'], 'patient_payment': ['sum', 'max'], 'plan_type': ['nunique'], 'event_name': ['nunique', 'size']})\n",
    "specialty_grpd.columns = ['grpd_by_specialty_' + '_'.join(c).strip('_') for c in specialty_grpd.columns]\n",
    "df = pd.merge(df, specialty_grpd, on = 'specialty', how='left')\n",
    "\n",
    "num_cols = num_cols + specialty_grpd.columns.tolist()\n",
    "features = [c for c in df.columns if c not in ['patient_id']]\n",
    "df[features] = df[features].apply(lambda x: reduce_mem_usage_colwise(x))\n",
    "del specialty_grpd\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "plan_type_grpd = df.groupby('plan_type').agg({'event_time': ['min'], 'patient_id': ['nunique'], 'patient_payment': ['sum', 'max'], 'specialty': ['nunique'],\n",
    "                                              'event_name': ['nunique']})\n",
    "plan_type_grpd.columns = ['grpd_by_plan_type_' + '_'.join(c).strip('_') for c in plan_type_grpd.columns]\n",
    "df = pd.merge(df, plan_type_grpd, on = 'plan_type', how='left')\n",
    "\n",
    "num_cols = num_cols + plan_type_grpd.columns.tolist()\n",
    "features = [c for c in df.columns if c not in ['patient_id']]\n",
    "df[features] = df[features].apply(lambda x: reduce_mem_usage_colwise(x))\n",
    "del plan_type_grpd\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "event_time_grpd = df.groupby('event_time').agg({'plan_type': ['nunique'], 'patient_id': ['nunique'], 'patient_payment': ['sum', 'max'], 'specialty': ['nunique'], 'event_time': ['min'],\n",
    "                                               'event_name': ['nunique']})\n",
    "event_time_grpd.columns = ['grpd_by_event_time_' + '_'.join(c).strip('_') for c in event_time_grpd.columns]\n",
    "df = pd.merge(df, event_time_grpd, on = 'event_time', how='left')\n",
    "\n",
    "num_cols = num_cols + event_time_grpd.columns.tolist()\n",
    "features = [c for c in df.columns if c not in ['patient_id']]\n",
    "df[features] = df[features].apply(lambda x: reduce_mem_usage_colwise(x))\n",
    "del event_time_grpd\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "features = [c for c in df.columns if c not in ['patient_id']]\n",
    "df[features] = df[features].apply(lambda x: reduce_mem_usage_colwise(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DONE\n",
      "DONE\n"
     ]
    }
   ],
   "source": [
    "add_num_cols = []\n",
    "grp_name = 'patient_id'\n",
    "grp = df.groupby('patient_id')\n",
    "\n",
    "num_cumsum_cols = [c + grp_name + '_cumsum' for c in num_cols]\n",
    "add_num_cols = add_num_cols + num_cumsum_cols\n",
    "total_num_cols = len(num_cumsum_cols)\n",
    "\n",
    "#Broken into two parts to keep RAM USAGE low\n",
    "df[num_cumsum_cols[0:len(num_cumsum_cols)//2]] = df[num_cols[0:len(num_cols)//2]].apply(lambda x: reduce_mem_usage_colwise(grp[x.name].cumsum()))\n",
    "print(\"DONE\")\n",
    "gc.collect()\n",
    "df[num_cumsum_cols[len(num_cumsum_cols)//2:]] = df[num_cols[len(num_cols)//2:]].apply(lambda x: reduce_mem_usage_colwise(grp[x.name].cumsum()))\n",
    "print(\"DONE\")\n",
    "gc.collect()\n",
    "\n",
    "num_cols = list(set(num_cols + add_num_cols))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "add_freq_cols = cat_cols\n",
    "freq_cols = [c + '_freq' for c in add_freq_cols]\n",
    "df[freq_cols] = df[add_freq_cols].apply(lambda x: reduce_mem_usage_colwise(x.map(x.value_counts())))\n",
    "\n",
    "add_freq_cumsum = [c + grp_name + '_cumsum' for c in freq_cols]\n",
    "df[add_freq_cumsum] = df[freq_cols].apply(lambda x: reduce_mem_usage_colwise(grp[x.name].cumsum()))\n",
    "num_cols = num_cols + freq_cols + add_freq_cumsum\n",
    "\n",
    "df['patient_id_freq'] = df['patient_id'].map(df['patient_id'].value_counts())\n",
    "num_cols = num_cols + ['patient_id_freq']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "aggs = {}\n",
    "for c in cat_cols:\n",
    "    aggs[c] = ['nunique']\n",
    "for c in num_cols:\n",
    "    aggs[c] = ['sum', 'min', 'max', 'std']\n",
    "    \n",
    "def apply_aggs(col):\n",
    "    res_lists = []\n",
    "    all_aggs = aggs[col]\n",
    "    for agg_func in all_aggs:\n",
    "        ser = pd.Series(grp[col].agg(agg_func), name = col + '_' + agg_func)\n",
    "        res_lists.append(ser)\n",
    "    return res_lists\n",
    "\n",
    "grp = df.groupby('patient_id')\n",
    "try:\n",
    "    pool = Pool(4) \n",
    "    all_aggs = pool.map(apply_aggs, (col for col in aggs.keys()))\n",
    "finally: \n",
    "    pool.close()\n",
    "    pool.join()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "all_aggs_flattened = [x for nested_list in all_aggs for x in nested_list]\n",
    "print(len(all_aggs_flattened))\n",
    "comp_df = pd.DataFrame(all_aggs_flattened).T\n",
    "comp_df = comp_df.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "del all_aggs, all_aggs_flattened\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "comp_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "ID_COL, TARGET_COL = 'patient_id', 'outcome_flag'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "grp = df.groupby('patient_id')\n",
    "comp_df = pd.merge(comp_df, grp['patient_payment'].agg(lambda x: x.values[-1]).rename('patient_payment_last_value').reset_index(), on = 'patient_id', how ='left')\n",
    "comp_df = pd.merge(comp_df, grp['event_time'].agg(lambda x: x.values[-1]).rename('event_time_last_value').reset_index(), on = 'patient_id', how ='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "features = [c for c in comp_df if c not in [ID_COL, TARGET_COL]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "comp_df[features] = comp_df[features].apply(lambda x: reduce_mem_usage_colwise(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "comp_train = pd.merge(train_labels, comp_df, how = 'inner', on = 'patient_id')\n",
    "comp_test = pd.merge(sample_sub, comp_df, how = 'inner', on = 'patient_id')\n",
    "target = comp_train[TARGET_COL]\n",
    "comp_train.shape, train_labels.shape, comp_test.shape, sample_sub.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "del comp_df\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "param = {'num_leaves': 5000,\n",
    "         'min_data_in_leaf': 400, \n",
    "         'objective':'binary',\n",
    "         'max_depth': -1,\n",
    "         'learning_rate': 0.01,\n",
    "         \"boosting\": \"gbdt\",\n",
    "         \"feature_fraction\": 0.55,\n",
    "         \"metric\": 'auc',\n",
    "         \"lambda_l1\": 6,\n",
    "         \"lambda_l2\": 2,\n",
    "         \"random_state\": 6,\n",
    "         \"verbosity\": -1,\n",
    "         'two_round': True,\n",
    "         'cat_smooth': 1,\n",
    "          'cat_l2':1,\n",
    "         'two_round': True}\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fold n°0\n",
      "Training until validation scores don't improve for 400 rounds\n",
      "[200]\ttraining's auc: 0.80984\tvalid_1's auc: 0.731885\n",
      "[400]\ttraining's auc: 0.848279\tvalid_1's auc: 0.744236\n",
      "[600]\ttraining's auc: 0.879354\tvalid_1's auc: 0.749324\n",
      "[800]\ttraining's auc: 0.904028\tvalid_1's auc: 0.752293\n",
      "[1000]\ttraining's auc: 0.924389\tvalid_1's auc: 0.752224\n",
      "[1200]\ttraining's auc: 0.940718\tvalid_1's auc: 0.752241\n",
      "Early stopping, best iteration is:\n",
      "[851]\ttraining's auc: 0.909565\tvalid_1's auc: 0.753483\n",
      "fold n°1\n",
      "Training until validation scores don't improve for 400 rounds\n",
      "[200]\ttraining's auc: 0.808651\tvalid_1's auc: 0.748934\n",
      "[400]\ttraining's auc: 0.847154\tvalid_1's auc: 0.756088\n",
      "[600]\ttraining's auc: 0.8784\tvalid_1's auc: 0.757948\n",
      "[800]\ttraining's auc: 0.903163\tvalid_1's auc: 0.758198\n",
      "[1000]\ttraining's auc: 0.923383\tvalid_1's auc: 0.757573\n",
      "Early stopping, best iteration is:\n",
      "[669]\ttraining's auc: 0.887528\tvalid_1's auc: 0.758926\n",
      "fold n°2\n",
      "Training until validation scores don't improve for 400 rounds\n",
      "[200]\ttraining's auc: 0.80733\tvalid_1's auc: 0.744694\n",
      "[400]\ttraining's auc: 0.846629\tvalid_1's auc: 0.75751\n",
      "[600]\ttraining's auc: 0.877893\tvalid_1's auc: 0.760832\n",
      "[800]\ttraining's auc: 0.902928\tvalid_1's auc: 0.761477\n",
      "[1000]\ttraining's auc: 0.923342\tvalid_1's auc: 0.762864\n",
      "[1200]\ttraining's auc: 0.939719\tvalid_1's auc: 0.762704\n",
      "[1400]\ttraining's auc: 0.953086\tvalid_1's auc: 0.762936\n",
      "Early stopping, best iteration is:\n",
      "[1070]\ttraining's auc: 0.929515\tvalid_1's auc: 0.76328\n",
      "fold n°3\n",
      "Training until validation scores don't improve for 400 rounds\n",
      "[200]\ttraining's auc: 0.806716\tvalid_1's auc: 0.761517\n",
      "[400]\ttraining's auc: 0.846102\tvalid_1's auc: 0.773902\n",
      "[600]\ttraining's auc: 0.876999\tvalid_1's auc: 0.779904\n",
      "[800]\ttraining's auc: 0.901942\tvalid_1's auc: 0.782665\n",
      "[1000]\ttraining's auc: 0.922749\tvalid_1's auc: 0.784596\n",
      "[1200]\ttraining's auc: 0.939581\tvalid_1's auc: 0.785574\n",
      "[1400]\ttraining's auc: 0.952807\tvalid_1's auc: 0.784301\n",
      "Early stopping, best iteration is:\n",
      "[1166]\ttraining's auc: 0.937124\tvalid_1's auc: 0.785666\n",
      "fold n°4\n",
      "Training until validation scores don't improve for 400 rounds\n",
      "[200]\ttraining's auc: 0.807611\tvalid_1's auc: 0.734187\n",
      "[400]\ttraining's auc: 0.846902\tvalid_1's auc: 0.744127\n",
      "[600]\ttraining's auc: 0.877899\tvalid_1's auc: 0.745569\n",
      "[800]\ttraining's auc: 0.902683\tvalid_1's auc: 0.745987\n",
      "[1000]\ttraining's auc: 0.922855\tvalid_1's auc: 0.746274\n",
      "[1200]\ttraining's auc: 0.938977\tvalid_1's auc: 0.745901\n",
      "[1400]\ttraining's auc: 0.952058\tvalid_1's auc: 0.744179\n",
      "Early stopping, best iteration is:\n",
      "[1079]\ttraining's auc: 0.929611\tvalid_1's auc: 0.746761\n",
      "fold n°5\n",
      "Training until validation scores don't improve for 400 rounds\n",
      "[200]\ttraining's auc: 0.808571\tvalid_1's auc: 0.734135\n",
      "[400]\ttraining's auc: 0.848042\tvalid_1's auc: 0.738569\n",
      "[600]\ttraining's auc: 0.878719\tvalid_1's auc: 0.739827\n",
      "[800]\ttraining's auc: 0.90329\tvalid_1's auc: 0.740854\n",
      "[1000]\ttraining's auc: 0.922998\tvalid_1's auc: 0.742184\n",
      "[1200]\ttraining's auc: 0.939091\tvalid_1's auc: 0.742808\n",
      "[1400]\ttraining's auc: 0.952145\tvalid_1's auc: 0.74202\n",
      "[1600]\ttraining's auc: 0.962569\tvalid_1's auc: 0.741616\n",
      "Early stopping, best iteration is:\n",
      "[1256]\ttraining's auc: 0.942952\tvalid_1's auc: 0.743227\n",
      "fold n°6\n",
      "Training until validation scores don't improve for 400 rounds\n",
      "[200]\ttraining's auc: 0.807511\tvalid_1's auc: 0.765933\n",
      "[400]\ttraining's auc: 0.84756\tvalid_1's auc: 0.770462\n",
      "[600]\ttraining's auc: 0.878661\tvalid_1's auc: 0.77144\n",
      "[800]\ttraining's auc: 0.903708\tvalid_1's auc: 0.771555\n",
      "[1000]\ttraining's auc: 0.923989\tvalid_1's auc: 0.770772\n",
      "Early stopping, best iteration is:\n",
      "[736]\ttraining's auc: 0.896035\tvalid_1's auc: 0.772034\n",
      "fold n°7\n",
      "Training until validation scores don't improve for 400 rounds\n",
      "[200]\ttraining's auc: 0.806704\tvalid_1's auc: 0.73488\n",
      "[400]\ttraining's auc: 0.847109\tvalid_1's auc: 0.746564\n",
      "[600]\ttraining's auc: 0.877639\tvalid_1's auc: 0.752984\n",
      "[800]\ttraining's auc: 0.903027\tvalid_1's auc: 0.754666\n",
      "[1000]\ttraining's auc: 0.923515\tvalid_1's auc: 0.756388\n",
      "[1200]\ttraining's auc: 0.939912\tvalid_1's auc: 0.757681\n",
      "[1400]\ttraining's auc: 0.953336\tvalid_1's auc: 0.759674\n",
      "[1600]\ttraining's auc: 0.963702\tvalid_1's auc: 0.760544\n",
      "[1800]\ttraining's auc: 0.972132\tvalid_1's auc: 0.761486\n",
      "[2000]\ttraining's auc: 0.978546\tvalid_1's auc: 0.761224\n",
      "[2200]\ttraining's auc: 0.9836\tvalid_1's auc: 0.761163\n",
      "[2400]\ttraining's auc: 0.987619\tvalid_1's auc: 0.761644\n",
      "[2600]\ttraining's auc: 0.99079\tvalid_1's auc: 0.761725\n",
      "Early stopping, best iteration is:\n",
      "[2332]\ttraining's auc: 0.986338\tvalid_1's auc: 0.761849\n",
      "fold n°8\n",
      "Training until validation scores don't improve for 400 rounds\n",
      "[200]\ttraining's auc: 0.809396\tvalid_1's auc: 0.729246\n",
      "[400]\ttraining's auc: 0.848497\tvalid_1's auc: 0.736351\n",
      "[600]\ttraining's auc: 0.878398\tvalid_1's auc: 0.739658\n",
      "[800]\ttraining's auc: 0.903381\tvalid_1's auc: 0.74087\n",
      "[1000]\ttraining's auc: 0.923679\tvalid_1's auc: 0.741175\n",
      "[1200]\ttraining's auc: 0.939972\tvalid_1's auc: 0.741098\n",
      "Early stopping, best iteration is:\n",
      "[956]\ttraining's auc: 0.919575\tvalid_1's auc: 0.741783\n",
      "fold n°9\n",
      "Training until validation scores don't improve for 400 rounds\n",
      "[200]\ttraining's auc: 0.808003\tvalid_1's auc: 0.754101\n",
      "[400]\ttraining's auc: 0.848552\tvalid_1's auc: 0.754749\n",
      "[600]\ttraining's auc: 0.878663\tvalid_1's auc: 0.755386\n",
      "[800]\ttraining's auc: 0.903196\tvalid_1's auc: 0.755538\n",
      "[1000]\ttraining's auc: 0.923393\tvalid_1's auc: 0.756402\n",
      "[1200]\ttraining's auc: 0.939601\tvalid_1's auc: 0.756276\n",
      "[1400]\ttraining's auc: 0.952741\tvalid_1's auc: 0.756184\n",
      "Early stopping, best iteration is:\n",
      "[1098]\ttraining's auc: 0.931698\tvalid_1's auc: 0.75684\n",
      "CV score: 0.75778 \n"
     ]
    }
   ],
   "source": [
    "max_iter = 10\n",
    "folds = StratifiedKFold(n_splits=max_iter, random_state=1991)\n",
    "oof = np.zeros(len(comp_train))\n",
    "categorical_columns = None\n",
    "predictions_test = np.zeros(len(comp_test))\n",
    "feature_importance_df = pd.DataFrame()\n",
    "\n",
    "for fold_, (trn_idx, val_idx) in enumerate(folds.split(comp_train.values, target)):\n",
    "    print(\"fold n°{}\".format(fold_))\n",
    "    X_trn, y_trn = comp_train.iloc[trn_idx][features], target.iloc[trn_idx]\n",
    "    X_val, y_val = comp_train.iloc[val_idx][features], target.iloc[val_idx]\n",
    "    X_comp_test = comp_test[features]\n",
    "    trn_data = lgb.Dataset(X_trn, y_trn, categorical_feature = categorical_columns)\n",
    "    val_data = lgb.Dataset(X_val, y_val, categorical_feature = categorical_columns)\n",
    "    num_round = 500000\n",
    "    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=200, early_stopping_rounds = 400)\n",
    "    oof[val_idx] = clf.predict(X_val, num_iteration=clf.best_iteration)\n",
    "    \n",
    "    del trn_data, val_data, X_trn, X_val, y_trn, y_val\n",
    "    \n",
    "    fold_importance_df = pd.DataFrame()\n",
    "    fold_importance_df[\"feature\"] = features\n",
    "    fold_importance_df[\"importance\"] = clf.feature_importance(importance_type='gain')\n",
    "    fold_importance_df[\"fold\"] = fold_ + 1\n",
    "    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
    "\n",
    "    current_comp_test_pred = clf.predict(X_comp_test, num_iteration=clf.best_iteration)\n",
    "    predictions_test += current_comp_test_pred / folds.n_splits\n",
    "    \n",
    "    del X_comp_test, current_comp_test_pred, clf\n",
    "    \n",
    "    gc.collect()\n",
    "\n",
    "print(\"CV score: {:<8.5f}\".format(roc_auc_score(target, oof)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7f65515eb198>"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAD8CAYAAAB0IB+mAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xl8XOV97/HPT/u+WJstbzJ4wWazQWGpyxIwlAIFkpsmpCUpLQ1JoVnvq2na5HVv0i1J01BCQpM4JC1NQpKW5CYQSMtm1rIZY2y8YeMF29iWbMtaRstoRs/9Y2aELGsZyXPmzMz5vl8vvTSeOZ7zsyx959HvPPM85pxDRERyX57fBYiISHoo8EVEAkKBLyISEAp8EZGAUOCLiASEAl9EJCAU+CIiAaHAFxEJCAW+iEhAFPhdwEj19fWupaXF7zJERLLGK6+8ctg515DMsRkV+C0tLaxdu9bvMkREsoaZ7Un2WLV0REQCQoEvIhIQnga+mdWY2f1mttXMtpjZhV6eT0RExud1D/8bwH85595nZkVAmcfnExGRcXgW+GZWBVwM3AzgnAsDYa/OJyIiE/OypXMK0A78q5m9amb3mFm5h+cTEZEJeBn4BcA5wLedcyuAEPC50QeZ2a1mttbM1ra3t3tYjohIsHkZ+PuAfc65F+N/vp/YC8BxnHOrnXOtzrnWhoak3jsgHusZiHDPMzvZdrDb71JEJIU8C3zn3EFgr5ktid91ObDZq/NJ6vzNg5v4u4e28Affe4Gu/kG/yxGRFPF6Hv7HgR+b2QZgOfAPHp9PTtJAJMoDr73NaTMrORIK89CGA36XJCIp4mngO+fWx9s1ZznnbnDOdXh5Pjl5a3d30D84xF/8zhJObSjnV+v3+12SiKSI3mkrx3lx5xHyDC44pY7Llzaxbs8x+sJRv8sSkRRQ4Mtxth3qpqWunPLiAi48pY5wdIhX9ugXM5FcoMCX42w/1MOipgoAzm2pBWD9XgW+SC5Q4MuwgUiU3UdCLG6qBKCqpJCWujJe39/lc2UikgoKfBm2v6OPIQctde+8IfqM2dVs3N/pY1UikioKfBl2sLMfgFk1JcP3nTG7mv3H+ugIaRkkkWynwJdhBxKBX106fN+Zs6sB2PS22joi2U6BL8MOdsUCf2bVOyP8RD9/e5uWWRDJdgp8GXags4/askJKi/KH76uvKKK6tJAdbT0+ViYiqaDAl2EHO/uZOaKdA2BmLGysYLsCXyTrKfBl2IHOfmZVl5xw/8KGCt5U4ItkPQW+DIuN8E8M/EVNFRwJhTVTRyTLKfAFgMHoEEdCYZoqTwz8Uxtj77zd0a5Rvkg2U+ALAB29sdF7XUXRCY8tbIgHvto6IllNgS8AHI23a2aUnxj4s2tKKSnMUx9fJMsp8AV4J/Bry04M/Lw8Y/6McnYf6U13WSKSQgp8AaAjFNvKcKwRPkBLfRm7j4TSWZKIpJgCXwA4Gu/h15YXjvl4S105bx3pJTrk0lmWiKSQAl8AhqdcjtXSAWipLyccHeJAZ186yxKRFFLgCxDr4VeWFFCYP/a3xPy6MgD2qI8vkrUU+ALEpmWO178HWFAfWyN/12H18UWylQJfgNgIf7x2DkBTZQnFBXns0YVbkaylwBdg8hF+Xp7RUlfOrsNq6YhkKwW+ALFpmRON8CHWx9cIXyR7FXj55Ga2G+gGokDEOdfq5flk+o71hqkuHXtKZkJLfTlPvtHO0JAjL8/SVJmIpIqngR/3bufc4TScR6YpEh0iFI5OHvh15YQjQxzo6md2TemEx4pI5lFLR+gZiABQWTLx639LfGrmbs3UEclKXge+Ax4xs1fM7FaPzyXT1NUXC/yqJFo6gJZYEMlSXrd0Vjrn3jazRuBRM9vqnHt65AHxF4JbAebNm+dxOTKWrv7YOjpVk4zwZ1bFpmZqhC+SnTwd4Tvn3o5/bgP+H3DeGMesds61
OudaGxoavCxHxtHVFw/8SUb4eXnG/LoyTc0UyVKeBb6ZlZtZZeI2cCXwulfnk+nr6o+3dEomDnyA+XXlmpopkqW8HOE3Ac+a2WvAS8BDzrn/8vB8Mk2Jls5kF20htsTCnqO9DGnVTJGs41kP3zm3Ezjbq+eX1Em2pQOamimSzTQtU+jqj2AGlcWTv/5raqZI9lLgC939g1QUFyT17tkWrZopkrUU+EJXXySpC7bwztRMXbgVyT4KfKGrfzCpC7agVTNFspkCX+jqG0zqgm3C/DptaC6SjRT4Qnd/ZNJ32Y60oF4bmotkIwW+0DMQoTLJHj5oQ3ORbKXAF0IDEcqK8pM+fv7w1Ez18UWyiQJfCIUjVCQxBz9hgVbNFMlKCvyAi0SH6B8coqwo+cBvqiyhpFCrZopkGwV+wPUORgEoL06+pZOXZ8yfUa4RvkiWUeAHXCi+21X5FFo6AC31Zew+oh6+SDZR4AdcaCA2wp/KRVuIzdTR1EyR7KLAD7jECH8qF20BFtTFpma+fUxTM0WyhQI/4ELhWOBP5aItxDZCAc3UEckmCvyAS7R0pjzCH56aqT6+SLZQ4Adcb2KEP4VZOgBNVcWamimSZRT4AdczzR6+WWLVTAW+SLZQ4Adc7zRn6QAsbKxge1t3qksSEY8o8AMuMcKf6kVbgMVNlezr6BtuC4lIZlPgB1xvOEJpYT75SWxvONqixgqcg53tauuIZAMFfsD1DESn/C7bhEVNFQC8cUhtHZFsoMAPuN5wZErr6Iw0v66cgjxje1tPiqsSES8o8AMuNBChfBr9e4DC/DwW1Jez/ZACXyQbKPADLjQQnfYIH2IXbjVTRyQ7eB74ZpZvZq+a2a+9PpdMXSgcmXYPH2JTM9862kt/fJllEclc6RjhfxLYkobzyDScTEsHYhdunYM329XWEcl0nga+mc0BrgHu8fI8Mn0n29JZ1FgJwA5duBXJeF6P8O8EPgsMjXeAmd1qZmvNbG17e7vH5chooXBkWm+6SmipL6Mgz9h6UH18kUznWeCb2bVAm3PulYmOc86tds61OudaGxoavCpHxuCcIzQwtQ3MRysuyGdhYwWb3+5KYWUi4gUvR/grgevMbDfwU+AyM/uRh+eTKRqIDDHkpr5S5minN1ezSYEvkvE8C3zn3F855+Y451qAG4EnnHM3eXU+mbrprpQ52unNVRzuGaCtqz8VZYmIRzQPP8DeWSnz5AL/jNnVALz+dudJ1yQi3klL4DvnnnTOXZuOc0ny3hnhn1xLZ+ms2EydTfvV1hHJZBrhB1jvNPezHa2ypJAF9eXq44tkOAV+gCVG+CfzTtuEZc1VbDqglo5IJlPgB1hvONbDP5k3XiWc0VzN3qN9dITCJ/1cIuINBX6ADY/wT7KlA3DOvBoAXt3bcdLPJSLeUOAHWG8KWzpnzamhIM94ZY8CXyRTKfADLJTClk5pUT7LmqtYt+fYST+XiHhDgR9goYEIBXlGUX5qvg3OmVfL+r3HiETHXTpJRHykwA+w0EBsLXyzqW9gPpZz5tfSNxjVQmoiGUqBH2ChcJTyopNv5yScO78WQH18kQylwA+wxAg/VZqrS2iuLuHFXUdS9pwikjoK/AALhaOUpTDwzYwLT63n+TePMDTkUva8IpIaCvwAi21vmLqWDsDKhXV09A6y+YCWWRDJNEkFvpn93MyuMTO9QOSQVLd0AFYurAfgf948nNLnFZGTl2yAfxv4A2C7mX3FzE7zsCZJk1A49SP8pqoSFjZW8NwO9fFFMk1Sge+ce8w594fAOcBu4FEz+x8z+2MzK/SyQPFO70A05SN8gJWn1vHSrqOEI5qPL5JJkm7RmFkdcDPwp8CrwDeIvQA86kll4rkeD1o6ABctaqBvMMrLu4+m/LlFZPqS7eH/AngGKAN+zzl3nXPuZ865jwMVXhYo3ohEhxiIDKVk4bTRVi6sp7ggj8e2HEr5c4vI9CU7wr/HObfMOfdl59wBADMrBnDOtXpW
nXgmlevojFZalM/KhfU8vqUN5zQ9UyRTJBv4fzfGfc+nshBJr8RuV160dAAuO62Rt4728mZ7yJPnF5Gpm/Cn3cxmArOBUjNbASQWXaki1t6RLBUaSGxvmPoRPsQCH+DxLYdY2Kiun0gmmGx49zvELtTOAe4YcX838Nce1SRpEBqItXQqPBrhN9eUsnRWFY9vaeOjl5zqyTlEZGom/Gl3zt0L3Gtm/8s59/M01SRp8M4I35vAB1i1tJG71+ygIxSmtrzIs/OISHIm7OGb2U3xmy1m9pnRH2moTzySuGjr1Qgf4IplTQw5eGJrm2fnEJHkTXbRtjz+uQKoHONDstTwCN+DWToJZ86uZmZVCY9u1vRMkUwwWUvnu/HPX5rqE5tZCfA0UBw/z/3Ouf87nSIl9ULxWTpejvDNjFXLGvnFuv30D0YpKfTuxUVEJpfsG6/+0cyqzKzQzB43s8Mj2j3jGQAuc86dDSwHrjKzC062YEkNr2fpJKxa2kRvOKrF1EQyQLLz8K90znUB1wL7gMXAX0z0F1xMT/yPhfEPvQsnQyRm6Xh50RbgwlPrqCguUFtHJAMkG/iJBdKuBn7inEtqkRQzyzez9UAb8Khz7sVp1CgeCA1EKC3MJz8vNfvZjqe4IJ9LFjfw2JY2bYoi4rNkA/9BM9sKtAKPm1kD0D/ZX3LORZ1zy4nN4z/PzM4YfYyZ3Wpma81sbXt7+1Rql5MQCnuzUuZYrljWRHv3AOv3HUvL+URkbMkuj/w54EKg1Tk3CISA65M9iXPuGPAkcNUYj612zrU651obGhqSfUo5SbHNT9JzEfXdSxrJzzO1dUR8NpUdrJYCHzCzDwPvA66c6GAzazCzmvjtUmAVsHW6hUpq9YYjnqyUOZbqskLOXzBDgS/is2Rn6fwQ+Cfgt4F3xT8mWyVzFrDGzDYALxPr4f/6JGqVFOpJ4wgfYm2dHW097DqsxdRE/JLsEK8VWOamsNatc24DsGJaVYnnesNRZqRxuYMrljXxpQc38+jmg9x6sdbWEfFDsi2d14GZXhYi6dUzkL6WDsCc2jKWzqpSW0fER8n+xNcDm83sJWJvqALAOXedJ1WJ53oHop6/6Wq0K5Y18a0ntnOkZ4C6iuK0nltEkg/8L3pZhKRfyKP9bCdy5bIm7np8O09sbeP3W+em9dwikvy0zKeA3UBh/PbLwDoP6xIPOecIhSOerqMzltObq5hVrcXURPyS7CydjwD3A9+N3zUb+KVXRYm3+gajDDnvtjccj5mxamkTT29vp38wmtZzi0jyF21vB1YCXQDOue1Ao1dFibd6BhIrZaZ/9cpVy5roHxzi+TePpP3cIkGXbOAPOOfCiT+YWQFaCC1rJRZOS/cIH+D8BTMoLczXpigiPkg28J8ys78mtpn5FcB/Ag96V5Z4KbE0sh+BX1KYz8qF9TyxtY0pvK1DRFIg2cD/HNAObAQ+CjwMfMGrosRboQHvNz+ZyGWnNbL/WB/b23omP1hEUiapn3jn3JCZ/RL4pXNOS1pmucRuV+meh59w6ZLYInlrtraxuEk7ZYqky2SbmJuZfdHMDhNb+GybmbWb2f9JT3nihZ4B7zcwn0hzTSmnzaxUH18kzSZr6XyK2Oycdznn6pxzM4DzgZVm9mnPqxNP+NnDT7jstEbW7umgs2/QtxpEgmaywP8w8EHn3K7EHc65ncBN8cckC2VK4EeHHM9sV4dQJF0mC/xC59wJu0/H+/iFYxwvWSAxD7/cpx4+wIp5tdSUFaqtI5JGkwV+eJqPSQYLDUQoKcyjIH8q+9+kVn6eccniBp7a1q69bkXSZLKf+LPNrGuMj27gzHQUKKkXCkd9u2A70ruXNHIkFOY17XUrkhYT/tQ75/z7nV88ExqIUJbGtfDHc8niBvIsNj1zxbxav8sRyXn+/U4vvvFjaeSx1JYXsWJeLWu26cKtSDoo8AOoZyDiy8JpY7nstEY27u+kravf71JEcp4CP4BCA9GMGOFDrI8P
8KRG+SKeU+AHUKa0dACWzqpkVnWJpmeKpIECP4B6BiJUZMBFW4htinLpkkae0aYoIp5T4AdQbzhzWjoA15w5i1A4qlG+iMcU+AGT2M+2PEMu2gJceGodDZXF/Gr9fr9LEclpCvyA6Q1HcT7sZzuR/Dzj985qZs3Wdjp7tZiaiFc8C3wzm2tma8xsi5ltMrNPenUuSV4mLJw2luuXNxOODvHw6wf8LkUkZ3k5wo8A/9s5txS4ALjdzJZ5eD5Jgp8bmE/krDnVLGqs4CcvveV3KSI5y7PAd84dcM6ti9/uBrYAs706nyRneAPzDJmlk2Bm3HTBfDbs6+S1vVpbR8QLaenhm1kLsAJ4cYzHbjWztWa2tr1db77xWo/P+9lO5D3nzKasKJ8fvbDH71JEcpLngW9mFcDPgU8557pGP+6cW+2ca3XOtTY0NHhdTuBlag8foKqkkBtWzOaB197mcM+A3+WI5BxPA9/MComF/Y+dc7/w8lySnMQG5pk0LXOkW357AeHoEN9/dtfkB4vIlHg5S8eA7wNbnHN3eHUemZrhHn4GjvABTm2o4OozZ/HD5/doiqZIink5wl8JfAi4zMzWxz+u9vB8koTu/liIVpVk7g6Vt1+6kJ6BCD94TqN8kVTybJjnnHsWMK+eX6anq3+Q/DyjzMf9bCezrLmK3z1jJt97Zid/eME8GitL/C5JJCfonbYB090fobKkgFjHLXN99qrTCEeGuPOx7X6XIpIzFPgB09U3SGVJZvbvR1pQX84fnj+Pn728lx1t3X6XI5ITFPgB090fobI4c/v3I33i8kWUFebz9w9t8bsUkZygwA+Yrv5Bqkozf4QPUFdRzCcuX8Sabe08tvmQ3+WIZD0FfsDEevjZMcIHuHllC4saK/jSrzdpgxSRk6TAD5ju/khGT8kcrTA/jy9dfzp7j/bxnafe9LsckaymwA+YbLloO9JvnVrPtWfN4ttPvsneo71+lyOStRT4ATI05OgJR6gqzZ4RfsLnr1lKfp7xpQc3+12KSNZS4AdI90AE56Aqy0b4ALOqS/nE5Yt4bMsh1mjvW5FpUeAHSDYsqzCRP1m5gFMbyvnig5sYiOgCrshUKfADpKsvtlJmtvXwE4oK8vjidaez50ivVtMUmQYFfoAMj/CzsIefcNGiBq5Y1sS3ntjBoa5+v8sRySoK/ADp7s/uEX7CF65ZSiTq+OpvtvpdikhWUeAHSFeW9/AT5teV85GLF/CLV/ez7q0Ov8sRyRoK/ADJlRE+wG2XLqSpqpgvPrCJoSHndzkiWUGBHyBdfbERfjYtrTCe8uIC/up3l7JhXyf3r9vndzkiWUGBHyDdAxFKCvMoKsiN//brlzezYl4NX/vvbcObs4vI+HLjJ1+SEltWIftH9wlmxv+5dhnt3QN8+0mtsyMyGQV+gBzrHaQmi6dkjmXFvFres2I2q5/Zyb4OrbMjMhEFfoB09IapLSvyu4yU++xVS8gz+LKmaYpMSIEfIMd6B6ktz60RPsTW2fnYJafy0IYDvLz7qN/liGQsBX6A5OoIH+CjF5/KrOoS/ubBzZqmKTIOBX5AOOfo6A1Tk6OBX1qUz19edRob93fy78/v9rsckYykwA+IUDjKYNRRW5Z7LZ2E65c38+4lDXz5N1vZ0dbtdzkiGcezwDezH5hZm5m97tU5JHkdoTBAzrZ0IDZN86vvO4uyonz+/L5XhxeLE5EYL0f4/wZc5eHzyxQc642FX00Oj/ABGitLuOuDK9je1sNtP16njc9FRvAs8J1zTwOaMpEhOnpjI/wZ5bk7wk+4aFEDX37PmTyz/TB/9IOX6OzTSF8E1MMPjETg5+pF29He/665fOPG5byyp4Nrv/kMr+zRqpoivge+md1qZmvNbG17e7vf5eSsREsnly/ajnb98tn87KMX4By8/7vP8/VHthGODPldlohvfA9859xq51yrc661oaHB73JyVmKEX51jSytM5tz5M3j4kxdx/fJmvvnEDq77
1rO8vr/T77JEfOF74Et6HOkJU1NWSEF+8P7Lq0oKueP9y7nnw60cCYW54e7nuPOxNxiMarQvweLltMyfAM8DS8xsn5nd4tW5ZHJt3f00Vhb7XYavVi1r4tFPX8y1Z83izse2c8Pdz7HnSMjvskTSxstZOh90zs1yzhU65+Y4577v1blkcu3dAzRWlvhdhu9qyoq488YVfPdD57L/WB/X3/0cL+w84ndZImkRvN/vA6qte4CGgI/wR/qd02fyy9tWUldexIe+/yKPbDrod0kinlPgB4BzjnYF/gla6sv5xW0rOb25mtt+vI7fbDzgd0kinlLgB0BXf4SByFDge/hjqS4t5Ie3nMfZc2v485+8qtCXnKbAD4D27gEAjfDHUVlSyL1/ch4r5tbw8Z+8qvaO5CwFfgC0dfcD0FChwB9PRXEB//rH7+KM2dXcft86Ht9yyO+SRFJOgR8AiRF+Y5UCfyKJkf7SWVX82Y/W8eS2Nr9LEkkpBX4AtHXFWzoVmpY5merSQv79T85jYWMFt/7wFZ7dftjvkkRSRoEfAPuP9VFRXEBVaYHfpWSFmrIifvyn53NKfTm33Psyv1q/3++SRFJCgR8Ae4/2Mqe2FDPzu5SsUVtexH0fuYCz59bwyZ+u5x8e3qKF1yTrKfADYF9HH3NnlPldRtaZUV7Ej245n5sumMfqp3fynn95ju2HtHWiZC8Ffo5zzrG3IzbCl6krKsjj7244k9UfOpeDnf1c881n+d7TO4kOOb9LE5kyBX6OOxoK0xuOMrdWI/yTceXpM/mvT13MxYsa+PuHt/Def3mOrQe7/C5LZEoU+DluX0cfgEb4KdBQWcz3Pnwu3/zgCvZ19HHtXc9yxyPbGIho31zJDgr8HLe3oxdAPfwUMTN+7+xmHvvMJVx3djN3PbGDa+56lhe14qZkAQV+jtvR1oMZtNSV+11KTqktL+KODyzn3/74XfSFo3xg9QvcdM+LvLDzCM6pvy+ZSYGf47Yf6mHejDJKi/L9LiUnXbqkkUc/czGfv3opWw92cePqF7j8jqf4zlNvDi9pIZIpFPg57o1D3SxqrPS7jJxWVlTARy4+hWc+exlfe99ZzCgr4iu/2cpvffkJbr9vnUb9kjH01ssc1j8YZdfhEFee3uR3KYFQWpTP77fO5fdb57KjrYefvvQW/7F2Lw9tOMCyWVV84vKFXLlsJnl5egOc+EMj/Bz2+v5OIkOOs+fU+F1K4CxsrOAL1y7jxb9exVfeeyZ9g1E+9qN1XH3XMzy04QBDmscvPlDg57D1e48BsHyeAt8vpUX53HjePB799MXc+YHlDEaHuP2+daz656f42ctvaUqnpJUCP4e9uOsoc2eUavPyDFCQn8cNK2bzyKcv4ZsfXEFJQT5/+fONXPTVNdy9ZgcHOvv8LlECQD38HBWODPE/Ow5z/YrZfpciI+TnxebxX3vWLJ7dcZjvPPUmX/vvbfzTI9u48JQ6Ll/axAWnzGDpzCr1+iXlFPg56vmdRwiFo1y6uMHvUmQMZsZFixq4aFEDuw6H+NX6/Tzw2tv87a83A1BZUsAZzdWcNaeaM+dUc9bsGubO0IqncnIU+Dnq/lf2UV1ayCVLFPiZbkF9OZ9atZhPrVrM28f6eGHnEV7Z08HG/Z384LldDEZjF3irSwtjLwCzEy8ENTRXl+hFQJLmaeCb2VXAN4B84B7n3Fe8PJ/E7Doc4uGNB7j5t1ooLtAbrrJJc00p7z1nDu89Zw4AA5EobxzsYcP+Y2zc18nG/Z2sfnonkfgsn7ryIs6IvwAsmVlJS105LfXlVBRrLCcn8uy7wszygbuBK4B9wMtm9oBzbrNX55RYQPzl/RsoKcjjo5ec4nc5cpKKC/I5M97W4fzYff2DUbYe7GbjvmNsiL8I3L2mnZEzPesrillQX8b8unJa6sqYVV3KjPIiasoK45+LqCwu0HWCgPFyGHAesMM5txPAzH4KXA8o8D0QGoiw7q0Ovv7IG6zfe4xv
3Lhcs3NyVElhPsvn1rB87jvTbfvCsTfZ7T4S/zgcYveRXp5+o53745vYj5ZnsTZRbVkR1WWF1MRvV5UWxj5KCqgevl0Yv11AZUkhxQV5FBfkqZ2UZbwM/NnA3hF/3sfwGCW1rv3mM/QPDh339vXj3tbiTrxvrGNHvvvdjTg6cf94746f1nMd9/dPLPb4vz/xuYaGHF39ESD2K/63/mAF157VPHaxkpNKi/JZ1lzFsuaqEx7rDUdo6xqgozdMR2+Yo6FBjvWG6ewbpKM3zLHeQTr7BjncE2Z7Ww+dfYN0x7+fJlOYbxTl55GfZ8PhP/I1YOTLwcgXh+PvH/mMNub94x1vJH/O485i49xO6vx2wn1M8d88Wm1ZEf/xsQsnOCI1vAz8sf59J0Smmd0K3Aowb968aZ1oYUPF8IWtyb7w43/jHH/c6GOZ0jfZ8cedcOzw7Ym/uabyjdhYVcKSpkp+e1E9JYXq28s7yooKaKkvoIXkV0yNDjl6+iN09cdeDLr6Bodvd/dHGIgMEY4MEY7GPid2ABtv0DXZAGii4xnv+OHB09Seb+Tx49yc9N+RzLGMd84xVJUUTvh4qngZ+PuAuSP+PAd4e/RBzrnVwGqA1tbWab3f/M4bV0znr4nIOPLzjOqyQqrLCo/7IZbs5uU7bV8GFpnZAjMrAm4EHvDwfCIiMgHPRvjOuYiZ/Tnw38SmZf7AObfJq/OJiMjEPJ2s65x7GHjYy3OIiEhytHiaiEhAKPBFRAJCgS8iEhAKfBGRgFDgi4gEhLnx1gvwgZm1A3tG3FUPHPapnKnKllpVZ+plS63ZUidkT62ZUOd851xS66BnVOCPZmZrnXOtfteRjGypVXWmXrbUmi11QvbUmi11JqilIyISEAp8EZGAyPTAX+13AVOQLbWqztTLllqzpU7InlqzpU4gw3v4IiKSOpk+whcRkRTJqMA3sxlm9qiZbY9/rp3g2Coz229m30pnjfFzT1qnmS03s+fNbJOZbTCzD6S5xqvMbJuZ7TCzz43xeLGZ/Sz++Itm1pLO+kbUMVmdnzGzzfGv4eNmNt+POuO1TFjriOPeZ2bOzHyZvZFMnWb2/vjXdZOZ3ZfuGkfUMdn//zwzW2Nmr8a/B672qc4fmFmbmb38RaLsAAAD/klEQVQ+zuNmZnfF/x0bzOycdNeYFOdcxnwA/wh8Ln77c8BXJzj2G8B9wLcysU5gMbAofrsZOADUpKm+fOBN4BSgCHgNWDbqmNuA78Rv3wj8zIevYzJ1vhsoi9/+Mz/qTLbW+HGVwNPAC0BrJtYJLAJeBWrjf27M1K8psR75n8VvLwN2+1TrxcA5wOvjPH418Btim9BdALzoR52TfWTUCJ/YJuf3xm/fC9ww1kFmdi7QBDySprpGm7RO59wbzrnt8dtvA21AUm+OSIHhDeSdc2EgsYH8SCP/DfcDl1v6d6SetE7n3BrnXG/8jy8Q2znND8l8TQH+ltiAoD+dxY2QTJ0fAe52znUAOOfa0lxjQjK1OiCxUW81Y+yalw7OuaeBoxMccj3w7y7mBaDGzGalp7rkZVrgNznnDgDEPzeOPsDM8oCvA3+R5tpGmrTOkczsPGIjmDfTUBuMvYH87PGOcc5FgE6gLi3VjVFD3Fh1jnQLsVGUHyat1cxWAHOdc79OZ2GjJPM1XQwsNrPnzOwFM7sqbdUdL5lavwjcZGb7iO2t8fH0lDZlU/1e9oWnG6CMxcweA2aO8dDnk3yK24CHnXN7vRyQpqDOxPPMAn4I/JFzbigVtSVz2jHuGz0dK6lN5j2WdA1mdhPQClziaUXjm7DW+EDkn4Gb01XQOJL5mhYQa+tcSuw3pmfM7Azn3DGPaxstmVo/CPybc+7rZnYh8MN4ren6WUpWJvw8TSrtge+cWzXeY2Z2yMxmOecOxINyrF81LwQuMrPbgAqgyMx6nHPjXkTzqU7MrAp4CPhC/Ne8dElmA/nEMfvMrIDYr8sT/crqhaQ2ujezVcReaC9x
zg2kqbbRJqu1EjgDeDI+EJkJPGBm1znn1qatyuT/719wzg0Cu8xsG7EXgJfTU+JxdUxW6y3AVQDOuefNrITY+jV+taHGk9T3su/8vogw6sLH1zj+Yug/TnL8zfhz0XbSOom1cB4HPuVDfQXATmAB71wMO33UMbdz/EXb/8jQOlcQa4UtSnd9U6111PFP4s9F22S+plcB98Zv1xNrRdRlaK2/AW6O315KLETNp++BFsa/aHsNx1+0fcmPGif9N/hdwKgvWl08JLfHP8+I398K3DPG8X4F/qR1AjcBg8D6ER/L01jj1cAb8bD8fPy+vwGui98uAf4T2AG8BJzi0//5ZHU+Bhwa8TV8wMfvzwlrHXWsL4Gf5NfUgDuAzcBG4MZM/ZoSm5nzXPzFYD1wpU91/oTYTLtBYqP5W4CPAR8b8TW9O/7v2OjX//1kH3qnrYhIQGTaLB0REfGIAl9EJCAU+CIiAaHAFxEJCAW+iEhAKPBFRAJCgS8iEhAKfBGRgPj/cv5/TBaOjXsAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Density plot of the values in `predictions_test`, to eyeball their distribution.\n",
    "pd.Series(predictions_test).plot.density()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Store the values in `oof` next to the training-set IDs and write them to CSV.\n",
    "# NOTE(review): `oof` is presumably the out-of-fold prediction vector aligned row-for-row\n",
    "# with `comp_train` -- confirm against the training loop.\n",
    "oof_df = pd.DataFrame()\n",
    "# Key the rows with ID_COL, the same constant the test-prediction cell uses, instead of a\n",
    "# hard-coded column name.\n",
    "oof_df[ID_COL] = comp_train[ID_COL]\n",
    "oof_df[TARGET_COL] = oof\n",
    "# index=False keeps the DataFrame index out of the file.\n",
    "oof_df.to_csv(OUTPUT_PATH + 'oofs_lgbm.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>patient_id</th>\n",
       "      <th>outcome_flag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>patient_2</td>\n",
       "      <td>0.226579</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>patient_3</td>\n",
       "      <td>0.161460</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>patient_5</td>\n",
       "      <td>0.445144</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>patient_8</td>\n",
       "      <td>0.027795</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>patient_14</td>\n",
       "      <td>0.139229</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>patient_15</td>\n",
       "      <td>0.090780</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>patient_16</td>\n",
       "      <td>0.079721</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>patient_33</td>\n",
       "      <td>0.140987</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>patient_38</td>\n",
       "      <td>0.060210</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>patient_41</td>\n",
       "      <td>0.568861</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   patient_id  outcome_flag\n",
       "0   patient_2      0.226579\n",
       "1   patient_3      0.161460\n",
       "2   patient_5      0.445144\n",
       "3   patient_8      0.027795\n",
       "4  patient_14      0.139229\n",
       "5  patient_15      0.090780\n",
       "6  patient_16      0.079721\n",
       "7  patient_33      0.140987\n",
       "8  patient_38      0.060210\n",
       "9  patient_41      0.568861"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pair every ID in `comp_test` with its entry in `predictions_test`.\n",
    "sub_df = comp_test[[ID_COL]].assign(**{TARGET_COL: predictions_test})\n",
    "# Only the TARGET_COL column is written to the spreadsheet; the ID column stays in\n",
    "# `sub_df` for the preview below.\n",
    "sub_df.loc[:, [TARGET_COL]].to_excel(OUTPUT_PATH + 'preds_lgbm.xlsx', index=False)\n",
    "sub_df.head(10)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
